Exploratory Data Analysis (EDA)

# Load the employee attrition dataset. Character columns are read in as
# factors (see the str() output below).
employees <- read.csv("CaseStudy2-data.csv")

# Structure check: 870 observations of 36 variables.
str(employees)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
# Count missing values per column -- all zeros below, so no imputation needed.
colSums(is.na(employees))
##                       ID                      Age                Attrition 
##                        0                        0                        0 
##           BusinessTravel                DailyRate               Department 
##                        0                        0                        0 
##         DistanceFromHome                Education           EducationField 
##                        0                        0                        0 
##            EmployeeCount           EmployeeNumber  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender               HourlyRate           JobInvolvement 
##                        0                        0                        0 
##                 JobLevel                  JobRole          JobSatisfaction 
##                        0                        0                        0 
##            MaritalStatus            MonthlyIncome              MonthlyRate 
##                        0                        0                        0 
##       NumCompaniesWorked                   Over18                 OverTime 
##                        0                        0                        0 
##        PercentSalaryHike        PerformanceRating RelationshipSatisfaction 
##                        0                        0                        0 
##            StandardHours         StockOptionLevel        TotalWorkingYears 
##                        0                        0                        0 
##    TrainingTimesLastYear          WorkLifeBalance           YearsAtCompany 
##                        0                        0                        0 
##       YearsInCurrentRole  YearsSinceLastPromotion     YearsWithCurrManager 
##                        0                        0                        0
# MonthlyIncome is right-skewed: mean (6390) well above the median (4946).
summary(employees$MonthlyIncome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1081    2840    4946    6390    8182   19999
# The response is imbalanced: 730 "No" vs 140 "Yes" (~16% attrition).
summary(employees$Attrition)
##  No Yes 
## 730 140

Are any ordinal variables currently being treated as continuous? Yes — several survey-response columns (JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance) are stored as integer codes, so we recode them as labelled factors below.

# Recode the 1-4 ordinal survey codes into labelled factors in one step.
#
# factor(levels =, labels =) maps code -> label atomically. The previous
# approach (sequential string replacement on the same column) silently
# coerced the column to character after the first assignment and depended
# on the order of the replacements; it also left the factor levels in
# alphabetical order rather than their natural Low -> Very High order.
recode_ordinal <- function(x, labels) {
  factor(x, levels = seq_along(labels), labels = labels)
}

# Shared label set for the satisfaction-style 1-4 scales.
satisfaction_labels <- c("Low", "Medium", "High", "Very High")

employees$JobInvolvement <- recode_ordinal(employees$JobInvolvement,
                                           satisfaction_labels)
summary(employees$JobInvolvement)

employees$JobSatisfaction <- recode_ordinal(employees$JobSatisfaction,
                                            satisfaction_labels)
summary(employees$JobSatisfaction)

# PerformanceRating uses its own scale; only codes 3 and 4 occur in the data.
employees$PerformanceRating <- recode_ordinal(
  employees$PerformanceRating,
  c("Low", "Good", "Excellent", "Outstanding"))
summary(employees$PerformanceRating)

employees$RelationshipSatisfaction <- recode_ordinal(
  employees$RelationshipSatisfaction, satisfaction_labels)
summary(employees$RelationshipSatisfaction)

employees$WorkLifeBalance <- recode_ordinal(
  employees$WorkLifeBalance, c("Bad", "Good", "Better", "Best"))
summary(employees$WorkLifeBalance)

Is job satisfaction associated with education field?

library(ggmosaic)

# Mosaic plot: the conditional distribution of job satisfaction within
# each education field.
ggplot(data = employees) +
  geom_mosaic(
    aes(x = product(JobSatisfaction, EducationField), fill = EducationField),
    na.rm = TRUE
  ) +
  labs(
    x = "Education Field",
    y = "Job Satisfaction",
    title = "f(JobSatisfaction | Education Field)"
  )

#Chi square says no significant difference in job satisfaction across education fields
table(employees$JobSatisfaction, employees$EducationField)
##            
##             Human Resources Life Sciences Marketing Medical Other
##   High                    4           103        27      80    14
##   Low                     3            75        21      52    11
##   Medium                  5            61        21      56     9
##   Very High               3           119        31      82    18
##            
##             Technical Degree
##   High                    26
##   Low                     17
##   Medium                  14
##   Very High               18
# Note: the warning below arises because the Human Resources column has
# small expected cell counts (observed cells of 3-5); p = 0.95 still gives
# no evidence of an association.
chisq.test(table(employees$JobSatisfaction, employees$EducationField))
## Warning in chisq.test(table(employees$JobSatisfaction,
## employees$EducationField)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(employees$JobSatisfaction, employees$EducationField)
## X-squared = 7.1692, df = 15, p-value = 0.9528

Which continuous variables show a difference in distribution between the attrition classes? We examine pairs plots of the continuous variables, coloured by Attrition.

library(GGally)

#there's 22 continuous variables
employees %>%
  select_if(is.numeric) %>%
  dim()
## [1] 870  22
# Pairs plot of numeric columns 1-5, coloured by Attrition, on a random
# subsample of 200 rows to keep the plot readable.
employees %>%
  select_if(is.numeric) %>%
  select(1:5) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")

# Numeric columns 6-10. The "standard deviation is zero" warnings emitted by
# this chunk come from EmployeeCount, which is constant (always 1), so cor()
# cannot be computed against it.
employees %>%
  select_if(is.numeric) %>%
  select(6:10) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

# Numeric columns 11-16. The sd-is-zero warnings here come from
# StandardHours, which is constant (always 80).
employees %>%
  select_if(is.numeric) %>%
  select(11:16) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

# Numeric columns 17-22 (the tenure/years variables) -- no constant columns
# in this range, so no warnings.
employees %>%
  select_if(is.numeric) %>%
  select(17:22) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")

Check for multicollinearity between continuous variables

library(corrplot)

# Correlation matrix of the numeric predictors.
# EmployeeCount and StandardHours are constant across all 870 rows, so
# cor() returns NA for them and warns "the standard deviation is zero";
# drop zero-variance columns before computing correlations so the plot
# has no empty rows/columns.
M <- employees %>%
  select_if(is.numeric) %>%
  select_if(function(col) sd(col) > 0) %>%
  cor()
corrplot(M, method = "circle")

Which categorical variables show a difference in attrition rate across their levels?

#there's 14 categorical variables
employees %>%
  select_if(is.factor) %>%
  dim()
## [1] 870  14
categs <- names(select_if(employees, is.factor))

# One stacked-proportion bar chart per categorical predictor, coloured by
# Attrition. The .data[[name]] pronoun looks the column up by its string
# name -- safer and clearer than eval(parse(text = ...)).
for (i in seq_along(categs)) {
  print(
    employees %>%
      ggplot(aes(x = .data[[categs[i]]], fill = Attrition)) +
      geom_bar(position = "fill") +
      xlab(categs[i])
  )
}

Setting up the train/test split

# Build a train/test split that rebalances the Attrition classes.
#
# "No" (730 rows) outnumbers "Yes" (140) roughly 5:1, so the majority class
# is cut into 8 folds and the minority into 2:
#   * train = 1/8 of the "No" rows + 1/2 of the "Yes" rows (roughly balanced)
#   * test  = 4/8 of the "No" rows + the other 1/2 of the "Yes" rows
# Folds 6-8 of the "No" rows are deliberately unused so the test set is not
# overwhelmingly majority-class.
#
# Requires caret::createFolds(). Each call draws a fresh random fold
# assignment, so call this ONCE per split and reuse both pieces -- calling
# it twice gives unrelated train/test sets that can overlap.
#
# Returns a two-element list: [[1]] / $train and [[2]] / $test.
split_train_test <- function(df) {
  data_no  <- df[which(df$Attrition == "No"), ]
  data_yes <- df[which(df$Attrition == "Yes"), ]

  # More folds on "No" so one of its folds is comparable in size to half
  # of the "Yes" rows.
  folds_no  <- createFolds(data_no$Attrition, k = 8)
  folds_yes <- createFolds(data_yes$Attrition, k = 2)

  train <- rbind(data_no[folds_no$Fold1, ],
                 data_yes[folds_yes$Fold1, ])

  test <- rbind(
    data_no[c(folds_no$Fold2, folds_no$Fold3,
              folds_no$Fold4, folds_no$Fold5), ],
    data_yes[folds_yes$Fold2, ]
  )

  list(train = train, test = test)
}

Predict attrition - Naive Bayes

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(plotROC)

#naive bayes
#balanced training set
# Split once and reuse both halves. The original code called
# split_train_test() twice -- once for train1 and once for test1 -- which
# draws two independent random fold assignments, so the "train" and "test"
# sets could share rows (test-set leakage).
split1 <- split_train_test(employees)
train1 <- split1[[1]]
test1 <- split1[[2]]

model.nb1 <- naiveBayes(Attrition ~ ., data = train1)
preds.nb1 <- predict(model.nb1, test1)
confusionMatrix(table(preds.nb1, test1$Attrition))

# ROC inputs: posterior probability of class "Yes" (column 2) vs the truth.
preds.nb1 <- predict(model.nb1, test1, type = "raw")
preds.nb1 <- prediction(preds.nb1[, 2], test1$Attrition)
roc.perf_1 <- performance(preds.nb1, measure = "tpr", x.measure = "fpr")

#unbalanced training set
# Baseline for comparison: a single 50/50 split that keeps the natural
# class imbalance (one createFolds call, so no leakage here).
folds <- createFolds(employees$Attrition, k=2)
train2 <- employees[folds$Fold1,]
test2 <- employees[folds$Fold2,]
model.nb2 <- naiveBayes(Attrition ~ ., data=train2)
preds.nb2 <- predict(model.nb2, test2)
confusionMatrix(table(preds.nb2, test2$Attrition))
## Confusion Matrix and Statistics
## 
##          
## preds.nb2  No Yes
##       No  306  26
##       Yes  59  44
##                                           
##                Accuracy : 0.8046          
##                  95% CI : (0.7642, 0.8408)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.9762369       
##                                           
##                   Kappa : 0.3922          
##                                           
##  Mcnemar's Test P-Value : 0.0005187       
##                                           
##             Sensitivity : 0.8384          
##             Specificity : 0.6286          
##          Pos Pred Value : 0.9217          
##          Neg Pred Value : 0.4272          
##              Prevalence : 0.8391          
##          Detection Rate : 0.7034          
##    Detection Prevalence : 0.7632          
##       Balanced Accuracy : 0.7335          
##                                           
##        'Positive' Class : No              
## 
# ROC for the unbalanced model, then overlay both naive Bayes curves:
# balanced (red) vs unbalanced (blue).
preds.nb2 <- predict(model.nb2, test2, type = "raw")
preds.nb2 <- prediction(preds.nb2[,2], test2$Attrition)
roc.perf_2 = performance(preds.nb2, measure = "tpr", x.measure = "fpr")

#the balanced one definitely looks better and .25 seems a good threshold
plot(roc.perf_1, col="red")
plot(roc.perf_2, add = TRUE, col="blue")

Predict attrition - KNN

#KNN
##load the package class
library(class)

#resetting employees data set to original because KNN only takes numeric
employees_num <- read.csv("CaseStudy2-data.csv")

# Keep the numeric columns and re-attach the Attrition label.
employees_num <- employees_num %>%
  select_if(is.numeric) %>%
  mutate(Attrition = employees_num$Attrition)

# Split once and reuse both halves -- two separate split_train_test() calls
# would draw different random folds and leak test rows into training.
split_knn <- split_train_test(employees_num)
train <- split_knn[[1]]
test <- split_knn[[2]]

##run knn k=8 and make ROC--it looks really bad
# NOTE(review): the features are unscaled; KNN is distance-based, so the
# large-range columns (MonthlyIncome, MonthlyRate) dominate the distances --
# likely a big part of why the AUC is poor. Consider scale().
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=8, prob=TRUE)
# class::knn's "prob" attribute is the vote share of the WINNING class, not
# P(Yes). Convert before scoring: for "No" predictions the "Yes" probability
# is the complement.
prob.knn <- attr(preds.knn, "prob")
prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn <- performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
plot(roc.perf_knn, colorize = TRUE)

# Sweep k = 1..80, recording the test-set AUC for each.
auc.knns <- c()
for (i in 1:80) {
  # One fresh split per candidate k; split once and reuse both halves so the
  # train/test pair comes from the same fold assignment (no leakage).
  split_i <- split_train_test(employees_num)
  train <- split_i[[1]]
  test <- split_i[[2]]

  preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
  # Convert the winning-class vote share to P(Yes) before scoring.
  prob.knn <- attr(preds.knn, "prob")
  prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
  preds.knn <- prediction(prob.knn, test$Attrition)
  auc <- performance(preds.knn, measure = "auc")
  # auc@y.values is a one-element list; unwrap it so auc.knns stays a plain
  # numeric vector that plot() below handles directly.
  auc.knns <- c(auc.knns, auc@y.values[[1]])
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")

#all the KNN models are bad

#add categorical variables 
employees_num <- read.csv("CaseStudy2-data.csv")

# Integer-encode the factor columns so knn() can use them.
# NOTE(review): this imposes an arbitrary ordering on nominal variables
# (e.g. JobRole); one-hot encoding would be more principled.
employees_num$BusinessTravel = as.numeric(employees_num$BusinessTravel)
employees_num$Department = as.numeric(employees_num$Department)
employees_num$EducationField = as.numeric(employees_num$EducationField)
employees_num$Gender = as.numeric(employees_num$Gender)
employees_num$JobRole = as.numeric(employees_num$JobRole)
employees_num$MaritalStatus = as.numeric(employees_num$MaritalStatus)
employees_num$OverTime = as.numeric(employees_num$OverTime)

employees_num <- employees_num %>%
  select_if(is.numeric) %>%
  mutate(Attrition = employees_num$Attrition)

# Split once and reuse both halves (calling split_train_test() twice would
# draw unrelated folds and leak test rows into training).
split_cat <- split_train_test(employees_num)
train <- split_cat[[1]]
test <- split_cat[[2]]

# Recode the Attrition factor to 0/1 numeric in one vectorized step
# (equivalent to the original character round-trip, without the coercions).
train$Attrition <- ifelse(train$Attrition == "Yes", 1, 0)
test$Attrition <- ifelse(test$Attrition == "Yes", 1, 0)

# Sanity-check the training frame: all predictors numeric, Attrition now 0/1.
str(train)
## 'data.frame':    162 obs. of  35 variables:
##  $ ID                      : int  4 30 42 91 100 104 117 122 124 129 ...
##  $ Age                     : int  32 52 45 38 34 23 25 21 38 43 ...
##  $ BusinessTravel          : num  3 1 3 2 3 3 3 3 3 3 ...
##  $ DailyRate               : int  801 715 954 1490 401 885 1356 501 243 930 ...
##  $ Department              : num  3 2 3 2 2 2 3 3 3 2 ...
##  $ DistanceFromHome        : int  1 19 2 2 1 4 10 5 7 6 ...
##  $ Education               : int  4 4 2 2 3 3 4 1 4 3 ...
##  $ EducationField          : num  3 4 6 2 2 4 2 4 3 4 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  2016 791 783 556 1447 705 1240 2021 709 1402 ...
##  $ EnvironmentSatisfaction : int  3 4 2 4 4 1 3 3 4 1 ...
##  $ Gender                  : num  1 2 2 2 1 2 2 2 1 1 ...
##  $ HourlyRate              : int  48 41 46 42 86 58 57 58 46 73 ...
##  $ JobInvolvement          : int  3 3 1 3 2 4 3 3 2 2 ...
##  $ JobLevel                : int  3 1 2 1 1 1 2 1 2 2 ...
##  $ JobRole                 : num  8 7 9 3 3 7 8 9 8 7 ...
##  $ JobSatisfaction         : int  4 4 3 4 2 1 4 1 4 3 ...
##  $ MaritalStatus           : num  2 2 3 2 2 2 3 3 3 3 ...
##  $ MonthlyIncome           : int  10422 4258 6632 1702 3294 2819 4950 2380 4028 4081 ...
##  $ MonthlyRate             : int  24032 26589 12388 12106 3708 8544 20623 25479 7791 20003 ...
##  $ NumCompaniesWorked      : int  1 0 0 1 5 2 0 1 0 1 ...
##  $ OverTime                : num  1 1 1 2 1 1 1 2 1 2 ...
##  $ PercentSalaryHike       : int  19 18 13 23 17 16 14 11 20 14 ...
##  $ PerformanceRating       : int  3 3 3 4 3 3 3 3 4 3 ...
##  $ RelationshipSatisfaction: int  3 1 1 3 1 1 2 4 1 1 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  2 1 0 1 1 1 0 0 0 0 ...
##  $ TotalWorkingYears       : int  14 5 9 1 7 5 5 2 8 20 ...
##  $ TrainingTimesLastYear   : int  3 3 3 3 2 3 4 6 2 3 ...
##  $ WorkLifeBalance         : int  3 3 3 3 2 4 3 3 3 1 ...
##  $ YearsAtCompany          : int  14 4 8 1 5 3 4 2 7 20 ...
##  $ YearsInCurrentRole      : int  10 3 7 0 4 2 3 2 7 7 ...
##  $ YearsSinceLastPromotion : int  5 1 3 0 0 0 1 1 0 1 ...
##  $ YearsWithCurrManager    : int  7 2 1 0 2 2 1 2 5 8 ...
##  $ Attrition               : num  0 0 0 0 0 0 0 0 0 0 ...
# Same sanity check for the test frame.
str(test)
## 'data.frame':    435 obs. of  35 variables:
##  $ ID                      : int  5 6 19 26 32 33 48 76 85 89 ...
##  $ Age                     : int  24 27 34 44 33 46 38 30 36 27 ...
##  $ BusinessTravel          : num  2 2 3 3 3 3 3 2 3 2 ...
##  $ DailyRate               : int  567 294 181 1099 267 488 770 721 429 994 ...
##  $ Department              : num  2 2 2 3 2 3 3 2 2 3 ...
##  $ DistanceFromHome        : int  2 10 2 5 21 2 10 1 2 8 ...
##  $ Education               : int  1 2 4 3 3 3 4 2 4 3 ...
##  $ EducationField          : num  6 2 4 3 4 6 3 4 2 2 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1646 733 1755 1267 1698 363 1119 57 1294 56 ...
##  $ EnvironmentSatisfaction : int  1 4 4 2 2 3 3 3 3 4 ...
##  $ Gender                  : num  1 2 2 2 2 1 2 1 1 2 ...
##  $ HourlyRate              : int  32 32 97 88 79 75 73 58 53 37 ...
##  $ JobInvolvement          : int  3 3 4 3 4 1 2 3 3 3 ...
##  $ JobLevel                : int  1 3 1 5 1 4 3 2 2 3 ...
##  $ JobRole                 : num  7 5 7 4 3 4 8 3 5 8 ...
##  $ JobSatisfaction         : int  4 1 4 2 2 2 3 4 2 3 ...
##  $ MaritalStatus           : num  3 1 2 2 2 2 1 3 3 3 ...
##  $ MonthlyIncome           : int  3760 8793 2932 18213 2028 16872 8740 4011 5410 8726 ...
##  $ MonthlyRate             : int  17218 4809 5586 8751 13637 14977 5569 10781 2323 2975 ...
##  $ NumCompaniesWorked      : int  1 1 0 7 1 3 0 1 9 1 ...
##  $ OverTime                : num  2 1 2 1 1 2 2 1 2 1 ...
##  $ PercentSalaryHike       : int  13 21 14 11 18 12 14 23 11 15 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 3 4 3 3 ...
##  $ RelationshipSatisfaction: int  3 3 1 3 4 2 2 4 4 4 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 2 3 1 3 1 2 0 0 0 ...
##  $ TotalWorkingYears       : int  6 9 6 26 14 28 9 12 18 9 ...
##  $ TrainingTimesLastYear   : int  2 4 3 5 6 2 2 2 2 0 ...
##  $ WorkLifeBalance         : int  3 2 3 3 3 2 3 3 3 3 ...
##  $ YearsAtCompany          : int  6 9 5 22 14 7 8 12 16 9 ...
##  $ YearsInCurrentRole      : int  3 7 0 9 11 7 7 8 14 8 ...
##  $ YearsSinceLastPromotion : int  1 1 1 3 2 7 2 3 5 1 ...
##  $ YearsWithCurrManager    : int  3 7 2 10 13 7 7 7 12 7 ...
##  $ Attrition               : num  0 0 0 0 0 0 0 0 0 0 ...
##run knn k=18 and make ROC
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=as.factor(train$Attrition), k=18, prob=TRUE)
# knn() reports the vote fraction of the WINNING class; convert it to the
# probability of class "1" before building the ROC, otherwise "0"
# predictions contribute the wrong score.
prob.knn <- attr(preds.knn, "prob")
prob.knn <- ifelse(preds.knn == "1", prob.knn, 1 - prob.knn)
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn <- performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
plot(roc.perf_knn, colorize = TRUE)

# Sweep k = 1..80 again, now with the integer-encoded categoricals included.
auc.knns <- c()
for (i in 1:80) {
  # Split once per iteration and reuse both halves (no leakage).
  split_i <- split_train_test(employees_num)
  train <- split_i[[1]]
  test <- split_i[[2]]

  preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
  # Convert the winning-class vote share to P(Yes) before scoring.
  prob.knn <- attr(preds.knn, "prob")
  prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
  preds.knn <- prediction(prob.knn, test$Attrition)
  auc <- performance(preds.knn, measure = "auc")
  # Unwrap the one-element list so auc.knns is a plain numeric vector.
  auc.knns <- c(auc.knns, auc@y.values[[1]])
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")

#they're all still bad
#RF
library(randomForest)

# Split once and reuse both halves -- the original two separate
# split_train_test() calls drew independent random folds, so train and
# test could overlap (test-set leakage).
split_rf <- split_train_test(employees)
train <- split_rf[[1]]
test <- split_rf[[2]]

model.rf <- randomForest(Attrition ~ ., data = train, importance = TRUE)
preds.rf <- predict(model.rf, test)

# Evaluate with "Yes" (attrition) as the positive class.
confusionMatrix(table(preds.rf, test$Attrition), positive = "Yes")

# ROC from the class-"Yes" probabilities (column 2).
prob.rf <- predict(model.rf, test, type = "prob")
preds.rf <- prediction(prob.rf[, 2], test$Attrition)
roc.rf <- performance(preds.rf, measure = "tpr", x.measure = "fpr")

# Compare the balanced naive Bayes ROC (red) against random forest (blue).
plot(roc.perf_1, col="red")
plot(roc.rf, add = TRUE, col="blue")

Random forest wins!

Regression time

Which continuous variables show a relationship with MonthlyIncome? We examine pairs plots of the continuous variables.

library(GGally)

# Re-read the raw data (undoing the ordinal-to-factor recoding above) and
# drop Over18, which has a single level.
employees <- read.csv("CaseStudy2-data.csv")
employees$Over18 <- NULL

#there's 27 numeric variables (the ordinal survey codes are numeric again)
employees %>%
  select_if(is.numeric) %>%
  dim()
## [1] 870  27
# The histogram shows MonthlyIncome is right-skewed.
employees %>%
  ggplot(aes(x=MonthlyIncome)) +
  geom_histogram()

# Pairs plot of numeric columns 1-5 vs MonthlyIncome, on a 200-row subsample.
employees %>%
  select_if(is.numeric) %>%
  select(1:5) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")

# Numeric columns 6-10; the sd-is-zero warnings come from the constant
# EmployeeCount column in this range.
employees %>%
  select_if(is.numeric) %>%
  select(6:10) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

# Numeric columns 11-15 vs MonthlyIncome -- no constant columns here.
employees %>%
  select_if(is.numeric) %>%
  select(11:15) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")

# Numeric columns 16-20; the sd-is-zero warnings come from the constant
# StandardHours column in this range.
employees %>%
  select_if(is.numeric) %>%
  select(16:20) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

# Numeric columns 21-27 (the tenure/years variables) vs MonthlyIncome.
employees %>%
  select_if(is.numeric) %>%
  select(21:27) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")

# MonthlyIncome is right-skewed, so model its log in the regression below.
employees$logMonthlyIncome = log(employees$MonthlyIncome)

Predicting Salary – linear Regression

# Final structure check before modelling (36 columns incl. logMonthlyIncome).
str(employees)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
##  $ logMonthlyIncome        : num  8.39 9.88 9.14 9.25 8.23 ...
# Fit the full linear model on the log scale.
# MonthlyIncome must be excluded: logMonthlyIncome is a deterministic
# function of it, so leaving it in makes the regression trivially (near-)
# perfect. The row identifiers ID and EmployeeNumber carry no real signal
# and are excluded as well.
model.lm <- lm(logMonthlyIncome ~ . - MonthlyIncome - ID - EmployeeNumber, data=employees)
# Residual diagnostics: residuals-vs-fitted, QQ, scale-location, leverage.
plot(model.lm)